library(tidyverse)
Warning: package ‘tidyverse’ was built under R version 4.2.2Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.4.0 ✔ purrr 0.3.4
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.4.1
✔ readr 2.1.3 ✔ forcats 0.5.2 Warning: package ‘ggplot2’ was built under R version 4.2.2── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
library(caret)
Warning: package ‘caret’ was built under R version 4.2.2Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(broom)
Warning: package ‘broom’ was built under R version 4.2.2
library(fastDummies)
Warning: package ‘fastDummies’ was built under R version 4.2.2
library(GGally)
Warning: package ‘GGally’ was built under R version 4.2.2Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(ggfortify)
Warning: package ‘ggfortify’ was built under R version 4.2.2
library(mosaic)
Warning: package ‘mosaic’ was built under R version 4.2.2Registered S3 method overwritten by 'mosaic':
method from
fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Attaching package: ‘mosaic’
The following object is masked from ‘package:Matrix’:
mean
The following object is masked from ‘package:caret’:
dotPlot
The following objects are masked from ‘package:dplyr’:
count, do, tally
The following object is masked from ‘package:purrr’:
cross
The following object is masked from ‘package:ggplot2’:
stat
The following objects are masked from ‘package:stats’:
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test, quantile, sd, t.test, var
The following objects are masked from ‘package:base’:
max, mean, min, prod, range, sample, sum
library(mosaicData)
library(modelr)
Warning: package ‘modelr’ was built under R version 4.2.2
Attaching package: ‘modelr’
The following object is masked from ‘package:mosaic’:
resample
The following object is masked from ‘package:ggformula’:
na.warn
The following object is masked from ‘package:broom’:
bootstrap
library(relaimpo)
Warning: package ‘relaimpo’ was built under R version 4.2.2Loading required package: MASS
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Loading required package: boot
Attaching package: ‘boot’
The following object is masked from ‘package:mosaic’:
logit
The following object is masked from ‘package:lattice’:
melanoma
Loading required package: survey
Warning: package ‘survey’ was built under R version 4.2.2Loading required package: grid
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:boot’:
aml
The following object is masked from ‘package:caret’:
cluster
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
Loading required package: mitools
Warning: package ‘mitools’ was built under R version 4.2.2This is the global version of package relaimpo.
If you are a non-US user, a version with the interesting additional metric pmvd is available
from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(tidyverse)
library(glmulti)
Warning: package ‘glmulti’ was built under R version 4.2.2Loading required package: rJava
Loading required package: leaps
Warning: package ‘leaps’ was built under R version 4.2.2
# Load the raw avocado data and convert the column names to snake_case
# (e.g. `AveragePrice` -> `average_price`, `4046` -> `x4046`).
avocados <- here::here("weekend/data/avocado.csv") %>%
  read_csv() %>%
  clean_names()
New names:Rows: 18249 Columns: 14── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): type, region
dbl (11): ...1, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, year
date (1): Date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sanity check: do the eight large regions add up to the "TotalUS" rows?
# All summaries look at a single week so the figures can be compared by eye.
# The region list and the date are defined once instead of being retyped in
# every filter, and the date is a real Date rather than a string.
major_regions <- c(
  "Midsouth", "Northeast", "Plains", "SouthCentral",
  "Southeast", "West", "GreatLakes", "California"
)
check_date <- as.Date("2015-12-27")

# Every distinct value of `region` in the raw data.
avocados %>%
  distinct(region)

# Reference figures: the TotalUS rows for the chosen week.
avocados %>%
  filter(region == "TotalUS", date == check_date)

# Totals by type across the eight large regions ...
avocados %>%
  filter(region %in% major_regions, date == check_date) %>%
  group_by(type) %>%
  summarise(
    total_volume = sum(total_volume),
    x4046 = sum(x4046),
    total_bags = sum(total_bags)
  )

# ... and across every remaining region (neither TotalUS nor a large region).
avocados %>%
  filter(!region %in% c("TotalUS", major_regions), date == check_date) %>%
  group_by(type) %>%
  summarise(
    total_volume = sum(total_volume),
    x4046 = sum(x4046),
    total_bags = sum(total_bags)
  )
# Midsouth, Northeast, Plains, SouthCentral, Southeast, West, GreatLakes, California
# These are the regions that make up the total US: the code above shows that their category totals match the TotalUS figures.
# I will filter the data so that it only uses the rows for the above regions, to avoid double counting.
# No need to worry about the smaller areas, as these are contained within the bigger regions.
# Also, I don't think it makes sense to look at regions that small,
# and by the looks of it the remaining cities aren't an exhaustive list.
# It might be good to extract month from date as a categorical variable.
# Proportions of 4046, 4225 and 4770 may be better than absolutes; likewise, proportions of bag sizes may be better than absolute numbers.
# Check whether the three bag-size columns sum to `total_bags` within the
# large regions. `a` is the difference of the two rounded values; rows are
# sorted so the largest positive gaps come first.
avocados %>%
  filter(
    region %in% c(
      "Midsouth", "Northeast", "Plains", "SouthCentral",
      "Southeast", "West", "GreatLakes", "California"
    )
  ) %>%
  mutate(
    a = round(small_bags + large_bags + x_large_bags, digits = 2) -
      round(total_bags, digits = 2)
  ) %>%
  arrange(desc(a))

# Print the data as it currently stands.
avocados

# Fit price on every other column and ask alias() for linearly dependent terms.
alias(
  lm(formula = average_price ~ ., data = avocados)
)
Model :
average_price ~ x1 + date + total_volume + x4046 + x4225 + x4770 +
total_bags + small_bags + large_bags + x_large_bags + type +
year + region
# For some reason the bag sizes don't exactly add up to total_bags:
# without the round they differ by about 0.000000000001;
# with the round there are a couple of rows at either end that are out by +-1.
# This means they don't show up in alias() - I'm going to manually remove x_large_bags, as it can be derived from the other three.
# x4046, x4225 and x4770 don't add up to total_volume, so all of them can be kept.
# Keep only the eight large regions, turn the categorical columns into
# factors, and drop the columns that will not be used as predictors.
avocados <- avocados %>%
  filter(
    region %in% c(
      "Midsouth", "Northeast", "Plains", "SouthCentral",
      "Southeast", "West", "GreatLakes", "California"
    )
  ) %>%
  mutate(
    month = as.factor(month(date)),          # month of the year as a category
    across(where(is.character), as.factor),  # the character columns: type, region
    year = as.factor(year)
  ) %>%
  dplyr::select(-c(x1, date, x_large_bags))
# Hold out a random 20% of the rows as a test set; the rest is used for fitting.
# The seed is fixed so the split, and every model summary that depends on it,
# is the same on each run. Figures from earlier unseeded runs will differ.
set.seed(2023)
n_data <- nrow(avocados)

# seq_len() is safe for an empty data set; floor() makes the truncation of the
# test-set size explicit.
test_index <- sample(seq_len(n_data), size = floor(n_data * 0.2))

test <- avocados %>%
  slice(test_index)

train <- avocados %>%
  slice(-test_index)
# Baseline model: price explained by total volume alone.
lm1 <- lm(formula = average_price ~ total_volume, data = train)
summary(lm1)
Call:
lm(formula = average_price ~ total_volume, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.67197 -0.17047 -0.01404 0.14308 0.99220
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.596e+00 7.024e-03 227.20 <2e-16 ***
total_volume -1.055e-07 2.167e-09 -48.68 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2424 on 2162 degrees of freedom
Multiple R-squared: 0.5229, Adjusted R-squared: 0.5227
F-statistic: 2369 on 1 and 2162 DF, p-value: < 2.2e-16
# Visual checks for lm1: the fitted model over the data, then the standard
# lm diagnostic plots.
plotModel(lm1)
plot(lm1)
# The diagnostics look odd; the suspected cause is the organic type.

# Residuals of lm1 next to the predictors not yet in the model, with `resid`
# moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm1) %>%
  dplyr::select(-c(average_price, total_volume)) %>%
  dplyr::select(resid, everything())

# Residuals against the categorical predictors. The predicate is wrapped in
# where() because bare predicates are deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(resid, where(is.factor)) %>%
  ggpairs()
Warning: Use of bare predicate functions was deprecated in tidyselect 1.1.0.
Please use wrap predicates in `where()` instead.
# Was:
data %>% select(is.factor)
# Now:
data %>% select(where(is.factor))
# Observation from the factor plot: month, type and region all look quite
# strongly related to the residuals.

# Residuals against the numeric predictors (where() replaces the deprecated
# bare predicate).
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()
Warning: Use of bare predicate functions was deprecated in tidyselect 1.1.0.
Please use wrap predicates in `where()` instead.
# Was:
data %>% select(is.numeric)
# Now:
data %>% select(where(is.numeric))
# Price against x4046, coloured by type, with one straight-line fit per type.
train %>%
  ggplot(aes(x = x4046, y = average_price, colour = type)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Split by type to check whether the two types correlate with different things.
# NOTE(review): the four plots below use the full `avocados` data rather than
# `train`, so they include the held-out test rows - confirm this is intended.
# Predicates are wrapped in where(); bare predicates are deprecated in tidyselect.

# Conventional: price against the categorical columns ...
avocados %>%
  filter(type == "conventional") %>%
  dplyr::select(average_price, where(is.factor)) %>%
  ggpairs()
# Original note: month, type and region all quite strong.

# ... and against the numeric columns.
avocados %>%
  filter(type == "conventional") %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Organic: price against the categorical columns ...
avocados %>%
  filter(type == "organic") %>%
  dplyr::select(average_price, where(is.factor)) %>%
  ggpairs()
# Original note: month, type and region all quite strong.

# ... and against the numeric columns.
avocados %>%
  filter(type == "organic") %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()
# Total volume and total bags are strongest.

# Going to add type first.
lm2 <- lm(formula = average_price ~ type, data = train)
summary(lm2)
Call:
lm(formula = average_price ~ type, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.70986 -0.16986 -0.01986 0.15490 0.97014
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.125101 0.007699 146.13 <2e-16 ***
typeorganic 0.484760 0.010909 44.44 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2537 on 2162 degrees of freedom
Multiple R-squared: 0.4774, Adjusted R-squared: 0.4771
F-statistic: 1975 on 1 and 2162 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots for the type-only model.
plot(lm2)
NA
NA
# Residuals of lm2 with the response and the modelled predictor removed, and
# `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm2) %>%
  dplyr::select(-c(average_price, type)) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining categorical predictors. where() replaces
# the bare predicate, which is deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(resid, where(is.factor)) %>%
  ggpairs()
# Original note: month, type and region all quite strong.
# NOTE(review): this remark is repeated from the lm1 step; `type` has been
# dropped from these columns, so it presumably refers to month and region.

# Residuals against the numeric predictors.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Add region to the model.
lm3 <- lm(formula = average_price ~ type + region, data = train)
summary(lm3)
Call:
lm(formula = average_price ~ type + region, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.68255 -0.13691 -0.02448 0.10986 1.00801
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.1669791 0.0141303 82.587 < 2e-16 ***
typeorganic 0.4830186 0.0092509 52.213 < 2e-16 ***
regionGreatLakes -0.0674479 0.0186557 -3.615 0.000307 ***
regionMidsouth 0.0001922 0.0186725 0.010 0.991786
regionNortheast 0.1899355 0.0188109 10.097 < 2e-16 ***
regionPlains 0.0287362 0.0185436 1.550 0.121370
regionSouthCentral -0.3124954 0.0186231 -16.780 < 2e-16 ***
regionSoutheast -0.0152272 0.0187933 -0.810 0.417888
regionWest -0.1380063 0.0185090 -7.456 1.28e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2151 on 2155 degrees of freedom
Multiple R-squared: 0.6256, Adjusted R-squared: 0.6242
F-statistic: 450.2 on 8 and 2155 DF, p-value: < 2.2e-16
# Diagnostic plots for lm3, then an F-test of lm3 against the nested lm2.
plot(lm3)
anova(lm2, lm3)
Analysis of Variance Table
Model 1: average_price ~ type
Model 2: average_price ~ type + region
Res.Df RSS Df Sum of Sq F Pr(>F)
1 2162 139.19
2 2155 99.70 7 39.485 121.92 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residuals of lm3 with the response and the modelled predictors removed, and
# `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm3) %>%
  dplyr::select(-c(average_price, type, region)) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining categorical predictors. where() replaces
# the bare predicate, which is deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(resid, where(is.factor)) %>%
  ggpairs()
# Original note: month, type and region all quite strong.
# NOTE(review): this remark is repeated from the lm1 step; `type` and `region`
# have been dropped from these columns.

# Residuals against the numeric predictors.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Add month to the model.
lm4 <- lm(formula = average_price ~ type + region + month, data = train)
summary(lm4)
Call:
lm(formula = average_price ~ type + region + month, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.54688 -0.12102 -0.00375 0.10678 0.86726
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.076014 0.017246 62.392 < 2e-16 ***
typeorganic 0.483058 0.008102 59.623 < 2e-16 ***
regionGreatLakes -0.066288 0.016345 -4.056 5.18e-05 ***
regionMidsouth -0.003844 0.016356 -0.235 0.814221
regionNortheast 0.191719 0.016475 11.637 < 2e-16 ***
regionPlains 0.030713 0.016244 1.891 0.058799 .
regionSouthCentral -0.310548 0.016311 -19.040 < 2e-16 ***
regionSoutheast -0.013073 0.016469 -0.794 0.427402
regionWest -0.134413 0.016212 -8.291 < 2e-16 ***
month2 -0.050981 0.018253 -2.793 0.005269 **
month3 0.011650 0.018077 0.644 0.519339
month4 0.035496 0.018890 1.879 0.060375 .
month5 0.021035 0.018949 1.110 0.267086
month6 0.075376 0.019690 3.828 0.000133 ***
month7 0.147845 0.018858 7.840 7.04e-15 ***
month8 0.218080 0.019229 11.341 < 2e-16 ***
month9 0.263865 0.019757 13.356 < 2e-16 ***
month10 0.266870 0.018916 14.108 < 2e-16 ***
month11 0.154964 0.019163 8.087 1.01e-15 ***
month12 0.018761 0.019291 0.972 0.330913
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1883 on 2144 degrees of freedom
Multiple R-squared: 0.7144, Adjusted R-squared: 0.7119
F-statistic: 282.3 on 19 and 2144 DF, p-value: < 2.2e-16
# Diagnostic plots for lm4, then an F-test of lm4 against the nested lm3.
plot(lm4)
anova(lm3, lm4)
Analysis of Variance Table
Model 1: average_price ~ type + region
Model 2: average_price ~ type + region + month
Res.Df RSS Df Sum of Sq F Pr(>F)
1 2155 99.700
2 2144 76.047 11 23.653 60.622 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Refit lm4 with a log-transformed response; the log helps the diagnostics.
# This replaces the untransformed lm4 fitted earlier.
lm4 <- lm(
  formula = log(average_price) ~ type + region + month,
  data = train
)
summary(lm4)
Call:
lm(formula = log(average_price) ~ type + region + month, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.48864 -0.09065 -0.00079 0.08789 0.45193
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.054821 0.012618 4.345 1.46e-05 ***
typeorganic 0.363732 0.005928 61.359 < 2e-16 ***
regionGreatLakes -0.027175 0.011959 -2.272 0.023161 *
regionMidsouth 0.018802 0.011967 1.571 0.116300
regionNortheast 0.146238 0.012054 12.132 < 2e-16 ***
regionPlains 0.034755 0.011885 2.924 0.003490 **
regionSouthCentral -0.245445 0.011934 -20.567 < 2e-16 ***
regionSoutheast 0.002760 0.012050 0.229 0.818870
regionWest -0.106270 0.011862 -8.959 < 2e-16 ***
month2 -0.046976 0.013356 -3.517 0.000445 ***
month3 0.015634 0.013226 1.182 0.237333
month4 0.027947 0.013822 2.022 0.043303 *
month5 0.012461 0.013864 0.899 0.368874
month6 0.061402 0.014407 4.262 2.11e-05 ***
month7 0.114630 0.013798 8.308 < 2e-16 ***
month8 0.156065 0.014069 11.093 < 2e-16 ***
month9 0.182715 0.014456 12.640 < 2e-16 ***
month10 0.193309 0.013840 13.967 < 2e-16 ***
month11 0.117171 0.014021 8.357 < 2e-16 ***
month12 0.016215 0.014115 1.149 0.250778
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1378 on 2144 degrees of freedom
Multiple R-squared: 0.729, Adjusted R-squared: 0.7266
F-statistic: 303.6 on 19 and 2144 DF, p-value: < 2.2e-16
# Diagnostic plots for the log-response version of lm4.
plot(lm4)
NA
# Residuals of the log-response lm4 with the response and the modelled
# predictors removed, and `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm4) %>%
  dplyr::select(-c(average_price, type, region, month)) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining categorical predictors. where() replaces
# the bare predicate, which is deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(resid, where(is.factor)) %>%
  ggpairs()
# Original note: month, type and region all quite strong.
# NOTE(review): this remark is repeated from the lm1 step; all three of those
# columns have been dropped here.

# Residuals against the numeric predictors.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Add year to the model.
lm5 <- lm(
  formula = log(average_price) ~ type + region + month + year,
  data = train
)
summary(lm5)
Call:
lm(formula = log(average_price) ~ type + region + month + year,
data = train)
Residuals:
Min 1Q Median 3Q Max
-0.50737 -0.07874 0.00299 0.08753 0.39464
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.014160 0.012693 1.116 0.264752
typeorganic 0.364207 0.005467 66.617 < 2e-16 ***
regionGreatLakes -0.026025 0.011030 -2.360 0.018386 *
regionMidsouth 0.021443 0.011038 1.943 0.052180 .
regionNortheast 0.145697 0.011117 13.106 < 2e-16 ***
regionPlains 0.033353 0.010961 3.043 0.002373 **
regionSouthCentral -0.245515 0.011006 -22.308 < 2e-16 ***
regionSoutheast 0.003288 0.011113 0.296 0.767358
regionWest -0.106062 0.010939 -9.695 < 2e-16 ***
month2 -0.045500 0.012323 -3.692 0.000228 ***
month3 0.021422 0.012213 1.754 0.079571 .
month4 0.039185 0.013006 3.013 0.002619 **
month5 0.038088 0.013063 2.916 0.003585 **
month6 0.079723 0.013544 5.886 4.58e-09 ***
month7 0.134623 0.012989 10.364 < 2e-16 ***
month8 0.173161 0.013243 13.075 < 2e-16 ***
month9 0.201671 0.013583 14.847 < 2e-16 ***
month10 0.208536 0.013023 16.013 < 2e-16 ***
month11 0.138760 0.013205 10.508 < 2e-16 ***
month12 0.027549 0.013273 2.076 0.038051 *
year2016 -0.028764 0.006968 -4.128 3.80e-05 ***
year2017 0.095170 0.006953 13.687 < 2e-16 ***
year2018 0.088329 0.012722 6.943 5.08e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1271 on 2141 degrees of freedom
Multiple R-squared: 0.7698, Adjusted R-squared: 0.7675
F-statistic: 325.5 on 22 and 2141 DF, p-value: < 2.2e-16
# Diagnostic plots for lm5, then an F-test of lm5 against the nested lm4.
plot(lm5)
anova(lm4, lm5)
Analysis of Variance Table
Model 1: log(average_price) ~ type + region + month
Model 2: log(average_price) ~ type + region + month + year
Res.Df RSS Df Sum of Sq F Pr(>F)
1 2144 40.711
2 2141 34.576 3 6.1356 126.64 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residuals of lm5 with the response and the modelled predictors removed, and
# `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm5) %>%
  dplyr::select(-c(average_price, type, region, month, year)) %>%
  dplyr::select(resid, everything())

# Residuals against any remaining categorical predictors. where() replaces
# the bare predicate, which is deprecated in tidyselect.
# NOTE(review): type, region, month and year have all been dropped above, so
# this plot presumably contains `resid` only - confirm whether it is needed.
avocados_resid %>%
  dplyr::select(resid, where(is.factor)) %>%
  ggpairs()
# Original note: month, type and region all quite strong.

# Residuals against the numeric predictors.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Add x4046 to the model.
lm6 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046,
  data = train
)
summary(lm6)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.48464 -0.07297 0.00623 0.08268 0.37507
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.529e-01 1.505e-02 10.163 < 2e-16 ***
typeorganic 2.596e-01 8.549e-03 30.370 < 2e-16 ***
regionGreatLakes -8.908e-02 1.124e-02 -7.925 3.64e-15 ***
regionMidsouth -3.716e-02 1.115e-02 -3.334 0.000870 ***
regionNortheast 7.083e-02 1.162e-02 6.097 1.28e-09 ***
regionPlains -2.063e-02 1.098e-02 -1.879 0.060387 .
regionSouthCentral -2.137e-01 1.065e-02 -20.075 < 2e-16 ***
regionSoutheast -9.302e-03 1.058e-02 -0.879 0.379354
regionWest -1.056e-01 1.038e-02 -10.174 < 2e-16 ***
month2 -3.590e-02 1.171e-02 -3.065 0.002205 **
month3 1.929e-02 1.159e-02 1.664 0.096272 .
month4 4.324e-02 1.235e-02 3.502 0.000471 ***
month5 4.414e-02 1.240e-02 3.558 0.000381 ***
month6 8.339e-02 1.286e-02 6.486 1.09e-10 ***
month7 1.356e-01 1.233e-02 11.000 < 2e-16 ***
month8 1.714e-01 1.257e-02 13.635 < 2e-16 ***
month9 1.948e-01 1.290e-02 15.104 < 2e-16 ***
month10 1.954e-01 1.239e-02 15.775 < 2e-16 ***
month11 1.247e-01 1.257e-02 9.922 < 2e-16 ***
month12 1.855e-02 1.261e-02 1.471 0.141462
year2016 -3.463e-02 6.624e-03 -5.227 1.89e-07 ***
year2017 9.202e-02 6.602e-03 13.937 < 2e-16 ***
year2018 9.390e-02 1.208e-02 7.773 1.18e-14 ***
x4046 -7.047e-08 4.578e-09 -15.392 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1206 on 2140 degrees of freedom
Multiple R-squared: 0.7928, Adjusted R-squared: 0.7906
F-statistic: 356 on 23 and 2140 DF, p-value: < 2.2e-16
# Diagnostic plots for lm6.
plot(lm6)
NA
# Residuals of lm6 with the response and the modelled predictors removed, and
# `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm6) %>%
  dplyr::select(-c(average_price, type, region, month, year, x4046)) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining numeric predictors. where() replaces the
# bare predicate, which is deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Add total_volume to the model.
lm7 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046 +
    total_volume,
  data = train
)
summary(lm7)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046 + total_volume, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.49928 -0.06748 0.00721 0.07714 0.36006
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.840e-01 1.781e-02 15.942 < 2e-16 ***
typeorganic 1.443e-01 1.227e-02 11.758 < 2e-16 ***
regionGreatLakes -9.998e-02 1.088e-02 -9.192 < 2e-16 ***
regionMidsouth -6.285e-02 1.094e-02 -5.745 1.05e-08 ***
regionNortheast 8.859e-02 1.129e-02 7.844 6.81e-15 ***
regionPlains -8.379e-02 1.170e-02 -7.161 1.10e-12 ***
regionSouthCentral -2.405e-01 1.048e-02 -22.941 < 2e-16 ***
regionSoutheast -5.848e-02 1.092e-02 -5.358 9.33e-08 ***
regionWest -9.657e-02 1.004e-02 -9.618 < 2e-16 ***
month2 -2.817e-02 1.131e-02 -2.490 0.012854 *
month3 1.611e-02 1.118e-02 1.440 0.150025
month4 4.164e-02 1.191e-02 3.496 0.000481 ***
month5 5.024e-02 1.197e-02 4.196 2.83e-05 ***
month6 8.560e-02 1.240e-02 6.902 6.73e-12 ***
month7 1.361e-01 1.189e-02 11.447 < 2e-16 ***
month8 1.658e-01 1.213e-02 13.667 < 2e-16 ***
month9 1.871e-01 1.246e-02 15.021 < 2e-16 ***
month10 1.842e-01 1.198e-02 15.369 < 2e-16 ***
month11 1.120e-01 1.216e-02 9.207 < 2e-16 ***
month12 1.354e-02 1.217e-02 1.113 0.266020
year2016 -2.083e-02 6.482e-03 -3.214 0.001328 **
year2017 1.050e-01 6.450e-03 16.281 < 2e-16 ***
year2018 1.180e-01 1.181e-02 9.995 < 2e-16 ***
x4046 -1.179e-08 6.394e-09 -1.843 0.065435 .
total_volume -4.936e-08 3.890e-09 -12.690 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1163 on 2139 degrees of freedom
Multiple R-squared: 0.8073, Adjusted R-squared: 0.8051
F-statistic: 373.4 on 24 and 2139 DF, p-value: < 2.2e-16
# Diagnostic plots for lm7.
plot(lm7)

# Residuals of lm7 with the response and the modelled predictors removed, and
# `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm7) %>%
  dplyr::select(
    -c(average_price, type, region, month, year, x4046, total_volume)
  ) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining numeric predictors. where() replaces the
# bare predicate, which is deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# Two candidate next steps:
#   lm8  - lm7 plus large_bags
#   lm8b - drops x4046 and uses small_bags and large_bags instead
lm8 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046 +
    total_volume + large_bags,
  data = train
)
lm8b <- lm(
  formula = log(average_price) ~ 1 + type + year + region + month +
    total_volume + small_bags + large_bags,
  data = train
)
summary(lm8)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046 + total_volume + large_bags, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.50015 -0.06814 0.00708 0.07650 0.35764
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.908e-01 1.868e-02 15.568 < 2e-16 ***
typeorganic 1.420e-01 1.241e-02 11.443 < 2e-16 ***
regionGreatLakes -1.039e-01 1.135e-02 -9.158 < 2e-16 ***
regionMidsouth -6.573e-02 1.119e-02 -5.874 4.93e-09 ***
regionNortheast 8.701e-02 1.137e-02 7.655 2.90e-14 ***
regionPlains -8.715e-02 1.202e-02 -7.251 5.76e-13 ***
regionSouthCentral -2.427e-01 1.064e-02 -22.819 < 2e-16 ***
regionSoutheast -6.408e-02 1.184e-02 -5.412 6.94e-08 ***
regionWest -1.037e-01 1.163e-02 -8.920 < 2e-16 ***
month2 -2.810e-02 1.131e-02 -2.484 0.013062 *
month3 1.604e-02 1.118e-02 1.434 0.151747
month4 4.139e-02 1.191e-02 3.475 0.000522 ***
month5 5.002e-02 1.197e-02 4.178 3.06e-05 ***
month6 8.568e-02 1.240e-02 6.909 6.42e-12 ***
month7 1.364e-01 1.189e-02 11.467 < 2e-16 ***
month8 1.660e-01 1.213e-02 13.683 < 2e-16 ***
month9 1.869e-01 1.246e-02 15.003 < 2e-16 ***
month10 1.834e-01 1.200e-02 15.284 < 2e-16 ***
month11 1.110e-01 1.219e-02 9.106 < 2e-16 ***
month12 1.282e-02 1.218e-02 1.052 0.292784
year2016 -2.210e-02 6.564e-03 -3.367 0.000773 ***
year2017 1.032e-01 6.612e-03 15.613 < 2e-16 ***
year2018 1.152e-01 1.202e-02 9.585 < 2e-16 ***
x4046 -1.102e-08 6.424e-09 -1.715 0.086432 .
total_volume -5.133e-08 4.212e-09 -12.187 < 2e-16 ***
large_bags 1.897e-08 1.556e-08 1.219 0.222961
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1163 on 2138 degrees of freedom
Multiple R-squared: 0.8074, Adjusted R-squared: 0.8052
F-statistic: 358.6 on 25 and 2138 DF, p-value: < 2.2e-16
# Diagnostic plots for lm8, then the coefficient summary of the alternative lm8b.
plot(lm8)
summary(lm8b)
Call:
lm(formula = log(average_price) ~ 1 + type + year + region +
month + total_volume + small_bags + large_bags, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.50016 -0.06826 0.00749 0.07718 0.36209
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.993e-01 1.855e-02 16.131 < 2e-16 ***
typeorganic 1.383e-01 1.217e-02 11.362 < 2e-16 ***
year2016 -2.909e-02 6.805e-03 -4.274 2.00e-05 ***
year2017 9.381e-02 7.048e-03 13.311 < 2e-16 ***
year2018 1.027e-01 1.243e-02 8.262 2.47e-16 ***
regionGreatLakes -9.767e-02 1.120e-02 -8.722 < 2e-16 ***
regionMidsouth -6.515e-02 1.113e-02 -5.856 5.47e-09 ***
regionNortheast 9.347e-02 1.057e-02 8.846 < 2e-16 ***
regionPlains -8.692e-02 1.190e-02 -7.302 3.99e-13 ***
regionSouthCentral -2.409e-01 1.030e-02 -23.381 < 2e-16 ***
regionSoutheast -6.196e-02 1.161e-02 -5.337 1.04e-07 ***
regionWest -9.988e-02 1.163e-02 -8.588 < 2e-16 ***
month2 -2.775e-02 1.128e-02 -2.461 0.0140 *
month3 1.313e-02 1.117e-02 1.176 0.2398
month4 3.853e-02 1.188e-02 3.242 0.0012 **
month5 4.795e-02 1.195e-02 4.014 6.19e-05 ***
month6 8.185e-02 1.240e-02 6.603 5.07e-11 ***
month7 1.334e-01 1.188e-02 11.229 < 2e-16 ***
month8 1.635e-01 1.210e-02 13.509 < 2e-16 ***
month9 1.853e-01 1.242e-02 14.923 < 2e-16 ***
month10 1.816e-01 1.197e-02 15.176 < 2e-16 ***
month11 1.083e-01 1.216e-02 8.901 < 2e-16 ***
month12 9.409e-03 1.217e-02 0.773 0.4396
total_volume -6.673e-08 3.906e-09 -17.084 < 2e-16 ***
small_bags 5.189e-08 1.263e-08 4.107 4.16e-05 ***
large_bags 1.890e-08 1.545e-08 1.224 0.2213
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1159 on 2138 degrees of freedom
Multiple R-squared: 0.8087, Adjusted R-squared: 0.8064
F-statistic: 361.5 on 25 and 2138 DF, p-value: < 2.2e-16
# Diagnostic plots for lm8b.
plot(lm8b)

# Residuals of lm8 with the response and the modelled predictors removed, and
# `resid` moved to the first column.
avocados_resid <- train %>%
  add_residuals(lm8) %>%
  dplyr::select(
    -c(average_price, type, region, month, year, x4046, total_volume,
       large_bags)
  ) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining numeric predictors. where() replaces the
# bare predicate, which is deprecated in tidyselect.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()

# NOTE(review): lm9 uses exactly the same formula and data as lm8 - confirm
# whether an additional term was meant to be added here.
lm9 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046 +
    total_volume + large_bags,
  data = train
)
summary(lm9)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046 + total_volume + large_bags, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.50015 -0.06814 0.00708 0.07650 0.35764
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.908e-01 1.868e-02 15.568 < 2e-16 ***
typeorganic 1.420e-01 1.241e-02 11.443 < 2e-16 ***
regionGreatLakes -1.039e-01 1.135e-02 -9.158 < 2e-16 ***
regionMidsouth -6.573e-02 1.119e-02 -5.874 4.93e-09 ***
regionNortheast 8.701e-02 1.137e-02 7.655 2.90e-14 ***
regionPlains -8.715e-02 1.202e-02 -7.251 5.76e-13 ***
regionSouthCentral -2.427e-01 1.064e-02 -22.819 < 2e-16 ***
regionSoutheast -6.408e-02 1.184e-02 -5.412 6.94e-08 ***
regionWest -1.037e-01 1.163e-02 -8.920 < 2e-16 ***
month2 -2.810e-02 1.131e-02 -2.484 0.013062 *
month3 1.604e-02 1.118e-02 1.434 0.151747
month4 4.139e-02 1.191e-02 3.475 0.000522 ***
month5 5.002e-02 1.197e-02 4.178 3.06e-05 ***
month6 8.568e-02 1.240e-02 6.909 6.42e-12 ***
month7 1.364e-01 1.189e-02 11.467 < 2e-16 ***
month8 1.660e-01 1.213e-02 13.683 < 2e-16 ***
month9 1.869e-01 1.246e-02 15.003 < 2e-16 ***
month10 1.834e-01 1.200e-02 15.284 < 2e-16 ***
month11 1.110e-01 1.219e-02 9.106 < 2e-16 ***
month12 1.282e-02 1.218e-02 1.052 0.292784
year2016 -2.210e-02 6.564e-03 -3.367 0.000773 ***
year2017 1.032e-01 6.612e-03 15.613 < 2e-16 ***
year2018 1.152e-01 1.202e-02 9.585 < 2e-16 ***
x4046 -1.102e-08 6.424e-09 -1.715 0.086432 .
total_volume -5.133e-08 4.212e-09 -12.187 < 2e-16 ***
large_bags 1.897e-08 1.556e-08 1.219 0.222961
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1163 on 2138 degrees of freedom
Multiple R-squared: 0.8074, Adjusted R-squared: 0.8052
F-statistic: 358.6 on 25 and 2138 DF, p-value: < 2.2e-16
# Diagnostic plots for lm9.
plot(lm9)

# First interaction: allow the total_volume slope to differ by type.
lm10 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046 +
    total_volume + large_bags + type:total_volume,
  data = train
)
summary(lm10)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046 + total_volume + large_bags + type:total_volume, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.40965 -0.06135 0.00169 0.06239 0.66682
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.951e-01 1.641e-02 24.086 < 2e-16 ***
typeorganic 2.021e-01 1.083e-02 18.663 < 2e-16 ***
regionGreatLakes -1.327e-01 9.760e-03 -13.594 < 2e-16 ***
regionMidsouth -1.250e-01 9.802e-03 -12.753 < 2e-16 ***
regionNortheast 8.775e-02 9.721e-03 9.026 < 2e-16 ***
regionPlains -2.177e-01 1.129e-02 -19.293 < 2e-16 ***
regionSouthCentral -2.844e-01 9.219e-03 -30.854 < 2e-16 ***
regionSoutheast -1.638e-01 1.073e-02 -15.262 < 2e-16 ***
regionWest -2.152e-02 1.037e-02 -2.076 0.038055 *
month2 -6.027e-03 9.708e-03 -0.621 0.534790
month3 3.897e-02 9.600e-03 4.059 5.10e-05 ***
month4 7.302e-02 1.025e-02 7.124 1.42e-12 ***
month5 9.117e-02 1.035e-02 8.812 < 2e-16 ***
month6 1.098e-01 1.064e-02 10.316 < 2e-16 ***
month7 1.510e-01 1.018e-02 14.823 < 2e-16 ***
month8 1.730e-01 1.038e-02 16.667 < 2e-16 ***
month9 1.893e-01 1.065e-02 17.768 < 2e-16 ***
month10 1.768e-01 1.026e-02 17.227 < 2e-16 ***
month11 1.034e-01 1.043e-02 9.919 < 2e-16 ***
month12 2.169e-02 1.042e-02 2.081 0.037565 *
year2016 2.092e-02 5.820e-03 3.595 0.000332 ***
year2017 1.678e-01 6.106e-03 27.475 < 2e-16 ***
year2018 2.330e-01 1.111e-02 20.977 < 2e-16 ***
x4046 2.568e-08 5.648e-09 4.547 5.74e-06 ***
total_volume -9.083e-08 3.868e-09 -23.481 < 2e-16 ***
large_bags -3.017e-08 1.342e-08 -2.248 0.024694 *
typeorganic:total_volume -1.488e-06 5.309e-08 -28.032 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09949 on 2137 degrees of freedom
Multiple R-squared: 0.8592, Adjusted R-squared: 0.8575
F-statistic: 501.6 on 26 and 2137 DF, p-value: < 2.2e-16
# Diagnostic plots for lm10.
plot(lm10)

# lm11 adds a type-by-region interaction to lm10.
lm11 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046 +
    total_volume + large_bags + type:total_volume + type:region,
  data = train
)

# lm12 additionally adds type-by-year and region-by-total_volume interactions.
lm12 <- lm(
  formula = log(average_price) ~ type + region + month + year + x4046 +
    total_volume + large_bags + type:total_volume + type:region +
    type:year + region:total_volume,
  data = train
)

summary(lm11)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046 + total_volume + large_bags + type:total_volume + type:region,
data = train)
Residuals:
Min 1Q Median 3Q Max
-0.40733 -0.05652 0.00533 0.05971 0.71079
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.697e-01 2.595e-02 21.956 < 2e-16 ***
typeorganic 2.742e-02 2.660e-02 1.031 0.302616
regionGreatLakes -2.170e-01 1.798e-02 -12.070 < 2e-16 ***
regionMidsouth -2.286e-01 1.778e-02 -12.852 < 2e-16 ***
regionNortheast 6.700e-03 2.023e-02 0.331 0.740587
regionPlains -3.982e-01 2.004e-02 -19.872 < 2e-16 ***
regionSouthCentral -2.632e-01 1.549e-02 -16.988 < 2e-16 ***
regionSoutheast -2.212e-01 1.897e-02 -11.660 < 2e-16 ***
regionWest -9.722e-02 1.625e-02 -5.983 2.56e-09 ***
month2 1.717e-03 9.194e-03 0.187 0.851907
month3 3.929e-02 9.064e-03 4.334 1.53e-05 ***
month4 7.673e-02 9.686e-03 7.922 3.72e-15 ***
month5 1.005e-01 9.797e-03 10.261 < 2e-16 ***
month6 1.165e-01 1.007e-02 11.569 < 2e-16 ***
month7 1.506e-01 9.611e-03 15.665 < 2e-16 ***
month8 1.707e-01 9.803e-03 17.411 < 2e-16 ***
month9 1.838e-01 1.009e-02 18.215 < 2e-16 ***
month10 1.622e-01 9.804e-03 16.539 < 2e-16 ***
month11 9.294e-02 9.927e-03 9.362 < 2e-16 ***
month12 1.435e-02 9.870e-03 1.454 0.146176
year2016 2.238e-02 5.545e-03 4.036 5.64e-05 ***
year2017 1.694e-01 5.822e-03 29.092 < 2e-16 ***
year2018 2.431e-01 1.056e-02 23.020 < 2e-16 ***
x4046 9.954e-09 1.040e-08 0.957 0.338432
total_volume -1.128e-07 5.639e-09 -20.004 < 2e-16 ***
large_bags 1.579e-08 1.667e-08 0.947 0.343529
typeorganic:total_volume -1.547e-06 5.493e-08 -28.162 < 2e-16 ***
typeorganic:regionGreatLakes 7.198e-02 2.114e-02 3.405 0.000675 ***
typeorganic:regionMidsouth 1.084e-01 2.108e-02 5.142 2.97e-07 ***
typeorganic:regionNortheast 8.498e-02 2.317e-02 3.668 0.000251 ***
typeorganic:regionPlains 2.464e-01 2.329e-02 10.581 < 2e-16 ***
typeorganic:regionSouthCentral -4.427e-02 1.942e-02 -2.279 0.022747 *
typeorganic:regionSoutheast 3.479e-02 2.255e-02 1.543 0.123010
typeorganic:regionWest 1.267e-01 1.948e-02 6.503 9.78e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0938 on 2130 degrees of freedom
Multiple R-squared: 0.8753, Adjusted R-squared: 0.8733
F-statistic: 452.9 on 33 and 2130 DF, p-value: < 2.2e-16
# Diagnostic plots for lm11, then one-row fit summaries for lm11 and lm12.
plot(lm11)
glance(lm11)
glance(lm12)

# Root-mean-squared residual of lm12 on the training rows.
train_rmse <- train %>%
  add_residuals(lm12) %>%
  mutate(sq_resid = resid^2) %>%
  summarise(
    mse = mean(sq_resid),
    rmse = mse^0.5
  ) %>%
  pull(rmse)

train_rmse
[1] 0.08717599
# Predictions and residuals of lm12 on the held-out test rows.
predictions_test <- test %>%
  add_predictions(lm12) %>%
  add_residuals(lm12) %>%
  dplyr::select(average_price, pred, resid)

# Root-mean-squared residual on the test rows, computed the same way as
# train_rmse.
test_rsme <- predictions_test %>%
  mutate(sq_resid = resid^2) %>%
  summarise(
    mse = mean(sq_resid),
    rmse = mse^0.5
  ) %>%
  pull(rmse)

# Ratio of test to training error; a value near 1 suggests little overfitting.
test_rsme / train_rmse
[1] 1.017817
# 10-fold cross-validation of two candidate formulas on the filtered data.
# The resampling spec is defined once, and the RNG is seeded immediately
# before each train() call so both candidates are scored on the same folds
# and the averages are reproducible.
cv_10_fold <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

# Candidate with the same terms as lm10 (type:total_volume interaction only).
set.seed(2023)
model2 <- train(
  log(average_price) ~ type + region + month + year + x4046 + total_volume +
    large_bags + type:total_volume,
  data = avocados,
  trControl = cv_10_fold,
  method = "lm"
)

# Average R-squared and RMSE across the ten folds.
model2$resample %>%
  summarise(
    av_r2 = mean(Rsquared),
    av_rmse = mean(RMSE)
  )

# Candidate with the same terms as lm12 (all four interactions). As in the
# original script this reuses the name `model2`, replacing the fit above.
set.seed(2023)
model2 <- train(
  log(average_price) ~ type + region + month + year + x4046 + total_volume +
    large_bags + type:total_volume + type:region + type:year +
    region:total_volume,
  data = avocados,
  trControl = cv_10_fold,
  method = "lm"
)

# Average R-squared and RMSE across the ten folds.
model2$resample %>%
  summarise(
    av_r2 = mean(Rsquared),
    av_rmse = mean(RMSE)
  )
# Relative importance of the lm9 predictors using the "lmg" metric;
# rela = TRUE normalises the shares to sum to 100%.
calc.relimp(lm9, type = "lmg", rela = TRUE)
Response variable: log(average_price)
Total response variance: 0.0694549
Analysis based on 2164 observations
25 Regressors:
Some regressors combined in groups:
Group region : regionGreatLakes regionMidsouth regionNortheast regionPlains regionSouthCentral regionSoutheast regionWest
Group month : month2 month3 month4 month5 month6 month7 month8 month9 month10 month11 month12
Group year : year2016 year2017 year2018
Relative importance of 7 (groups of) regressors assessed:
region month year type x4046 total_volume large_bags
Proportion of variance explained by model: 80.74%
Metrics are normalized to sum to 100% (rela=TRUE).
Relative importance metrics:
lmg
region 0.13937651
month 0.08720089
year 0.05339973
type 0.20057461
x4046 0.21021625
total_volume 0.24157541
large_bags 0.06765660
Average coefficients for different model sizes:
1group 2groups 3groups 4groups 5groups 6groups 7groups
type 3.650626e-01 2.818672e-01 2.268670e-01 1.923404e-01 1.708109e-01 1.557828e-01 1.420239e-01
regionGreatLakes -3.105644e-02 -6.536054e-02 -9.093596e-02 -1.060173e-01 -1.109577e-01 -1.086055e-01 -1.039330e-01
regionMidsouth 2.355831e-02 -2.252880e-02 -5.146003e-02 -6.662130e-02 -7.149307e-02 -6.993080e-02 -6.573028e-02
regionNortheast 1.455191e-01 1.028825e-01 7.976257e-02 7.198282e-02 7.456472e-02 8.161071e-02 8.701139e-02
regionPlains 2.070846e-02 -2.855559e-02 -6.105776e-02 -8.005414e-02 -8.894825e-02 -9.069393e-02 -8.714929e-02
regionSouthCentral -2.536335e-01 -2.280866e-01 -2.218658e-01 -2.263421e-01 -2.342857e-01 -2.405727e-01 -2.427310e-01
regionSoutheast 3.531651e-03 -5.199057e-03 -2.416280e-02 -4.418394e-02 -5.879663e-02 -6.513071e-02 -6.407696e-02
regionWest -1.085448e-01 -7.199524e-02 -7.201130e-02 -8.630120e-02 -9.923357e-02 -1.040612e-01 -1.037239e-01
month2 -5.186263e-02 -3.902048e-02 -3.161419e-02 -2.844915e-02 -2.789532e-02 -2.815423e-02 -2.810317e-02
month3 1.695275e-02 1.387872e-02 1.331812e-02 1.377802e-02 1.445124e-02 1.523128e-02 1.603579e-02
month4 2.661084e-02 2.808541e-02 3.189449e-02 3.549137e-02 3.784346e-02 3.968660e-02 4.138582e-02
month5 8.649173e-03 1.885438e-02 2.869720e-02 3.626468e-02 4.127649e-02 4.559877e-02 5.002444e-02
month6 4.495922e-02 5.462191e-02 6.464176e-02 7.263623e-02 7.786383e-02 8.187851e-02 8.568132e-02
month7 1.092868e-01 1.106613e-01 1.163355e-01 1.227247e-01 1.278965e-01 1.322363e-01 1.363588e-01
month8 1.587040e-01 1.504023e-01 1.509025e-01 1.549697e-01 1.591645e-01 1.628472e-01 1.659996e-01
month9 1.794670e-01 1.713216e-01 1.709562e-01 1.742895e-01 1.785286e-01 1.829385e-01 1.868981e-01
month10 1.925655e-01 1.803850e-01 1.756806e-01 1.754526e-01 1.774417e-01 1.806015e-01 1.833873e-01
month11 1.160070e-01 1.026353e-01 9.823998e-02 9.898124e-02 1.021506e-01 1.067620e-01 1.109910e-01
month12 1.376898e-02 6.654850e-03 4.376123e-03 4.940819e-03 6.959231e-03 1.001879e-02 1.282026e-02
year2016 -3.579802e-02 -2.461379e-02 -2.161431e-02 -2.253051e-02 -2.362133e-02 -2.302443e-02 -2.210155e-02
year2017 8.610992e-02 1.014739e-01 1.062426e-01 1.053060e-01 1.033841e-01 1.031367e-01 1.032399e-01
year2018 -7.601043e-03 4.083933e-02 6.759340e-02 8.145847e-02 9.148388e-02 1.032011e-01 1.152293e-01
x4046 -1.813739e-07 -1.520503e-07 -1.205773e-07 -8.851581e-08 -5.791840e-08 -3.127724e-08 -1.101968e-08
total_volume -8.316772e-08 -7.559854e-08 -6.875720e-08 -6.279969e-08 -5.790218e-08 -5.412286e-08 -5.133306e-08
large_bags -4.656543e-07 -2.851997e-07 -1.487453e-07 -5.749024e-08 -7.636612e-09 1.164011e-08 1.897008e-08
AUTOMATED
#### GARBAGE
# Forward stepwise subset selection over every column of avocados, allowing
# up to 40 terms.
# NOTE(review): the response here is raw average_price, whereas the other
# models in this notebook use log(average_price) - confirm this is intended.
regsubset_forwards <- regsubsets(average_price ~ .,
data = avocados,
nvmax = 40,
method = "forward")
# Table of which terms are included at each subset size.
summary(regsubset_forwards)
Subset selection object
Call: regsubsets.formula(average_price ~ ., data = avocados, nvmax = 40,
method = "forward")
28 Variables (and intercept)
Forced in Forced out
total_volume FALSE FALSE
x4046 FALSE FALSE
x4225 FALSE FALSE
x4770 FALSE FALSE
small_bags FALSE FALSE
large_bags FALSE FALSE
typeorganic FALSE FALSE
year2016 FALSE FALSE
year2017 FALSE FALSE
year2018 FALSE FALSE
regionGreatLakes FALSE FALSE
regionMidsouth FALSE FALSE
regionNortheast FALSE FALSE
regionPlains FALSE FALSE
regionSouthCentral FALSE FALSE
regionSoutheast FALSE FALSE
regionWest FALSE FALSE
month2 FALSE FALSE
month3 FALSE FALSE
month4 FALSE FALSE
month5 FALSE FALSE
month6 FALSE FALSE
month7 FALSE FALSE
month8 FALSE FALSE
month9 FALSE FALSE
month10 FALSE FALSE
month11 FALSE FALSE
month12 FALSE FALSE
1 subsets of each size up to 28
Selection Algorithm: forward
total_volume x4046 x4225 x4770 small_bags large_bags typeorganic year2016 year2017 year2018
1 ( 1 ) "*" " " " " " " " " " " " " " " " " " "
2 ( 1 ) "*" " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" " " " " " " " " " " " " " " "*" " "
4 ( 1 ) "*" " " " " " " " " " " " " " " "*" " "
5 ( 1 ) "*" " " " " " " " " " " "*" " " "*" " "
6 ( 1 ) "*" " " " " " " " " " " "*" " " "*" " "
7 ( 1 ) "*" " " " " " " " " " " "*" " " "*" " "
8 ( 1 ) "*" " " " " " " " " " " "*" " " "*" " "
9 ( 1 ) "*" " " " " " " " " " " "*" " " "*" " "
10 ( 1 ) "*" " " " " " " " " " " "*" " " "*" " "
11 ( 1 ) "*" " " " " " " " " " " "*" " " "*" "*"
12 ( 1 ) "*" " " " " " " " " " " "*" " " "*" "*"
13 ( 1 ) "*" " " " " " " " " " " "*" " " "*" "*"
14 ( 1 ) "*" " " " " " " " " " " "*" " " "*" "*"
15 ( 1 ) "*" " " " " " " " " " " "*" " " "*" "*"
16 ( 1 ) "*" " " " " "*" " " " " "*" " " "*" "*"
17 ( 1 ) "*" " " " " "*" " " " " "*" "*" "*" "*"
18 ( 1 ) "*" " " "*" "*" " " " " "*" "*" "*" "*"
19 ( 1 ) "*" " " "*" "*" " " " " "*" "*" "*" "*"
20 ( 1 ) "*" " " "*" "*" " " " " "*" "*" "*" "*"
21 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*"
22 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*"
23 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*"
24 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*"
25 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*"
26 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
27 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
28 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
regionGreatLakes regionMidsouth regionNortheast regionPlains regionSouthCentral regionSoutheast regionWest
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " "*" " " " " " " " "
3 ( 1 ) " " " " "*" " " " " " " " "
4 ( 1 ) " " " " "*" " " "*" " " " "
5 ( 1 ) " " " " "*" " " "*" " " " "
6 ( 1 ) " " " " "*" " " "*" " " " "
7 ( 1 ) " " " " "*" " " "*" " " " "
8 ( 1 ) " " " " "*" " " "*" " " " "
9 ( 1 ) " " " " "*" " " "*" " " " "
10 ( 1 ) " " " " "*" " " "*" " " " "
11 ( 1 ) " " " " "*" " " "*" " " " "
12 ( 1 ) " " " " "*" " " "*" " " " "
13 ( 1 ) "*" " " "*" " " "*" " " " "
14 ( 1 ) "*" " " "*" " " "*" " " "*"
15 ( 1 ) "*" " " "*" " " "*" " " "*"
16 ( 1 ) "*" " " "*" " " "*" " " "*"
17 ( 1 ) "*" " " "*" " " "*" " " "*"
18 ( 1 ) "*" " " "*" " " "*" " " "*"
19 ( 1 ) "*" " " "*" " " "*" " " "*"
20 ( 1 ) "*" " " "*" " " "*" " " "*"
21 ( 1 ) "*" " " "*" " " "*" " " "*"
22 ( 1 ) "*" "*" "*" " " "*" " " "*"
23 ( 1 ) "*" "*" "*" "*" "*" " " "*"
24 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
25 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
26 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
27 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
28 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
month2 month3 month4 month5 month6 month7 month8 month9 month10 month11 month12
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
6 ( 1 ) " " " " " " " " " " " " " " " " "*" " " " "
7 ( 1 ) " " " " " " " " " " " " " " "*" "*" " " " "
8 ( 1 ) " " " " " " " " " " " " "*" "*" "*" " " " "
9 ( 1 ) " " " " " " " " " " "*" "*" "*" "*" " " " "
10 ( 1 ) " " " " " " " " " " "*" "*" "*" "*" "*" " "
11 ( 1 ) " " " " " " " " " " "*" "*" "*" "*" "*" " "
12 ( 1 ) " " " " " " " " "*" "*" "*" "*" "*" "*" " "
13 ( 1 ) " " " " " " " " "*" "*" "*" "*" "*" "*" " "
14 ( 1 ) " " " " " " " " "*" "*" "*" "*" "*" "*" " "
15 ( 1 ) "*" " " " " " " "*" "*" "*" "*" "*" "*" " "
16 ( 1 ) "*" " " " " " " "*" "*" "*" "*" "*" "*" " "
17 ( 1 ) "*" " " " " " " "*" "*" "*" "*" "*" "*" " "
18 ( 1 ) "*" " " " " " " "*" "*" "*" "*" "*" "*" " "
19 ( 1 ) "*" " " " " "*" "*" "*" "*" "*" "*" "*" " "
20 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
21 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
22 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
23 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
24 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
25 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
26 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
27 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
28 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# Term-inclusion plot for the forward selection, with models ranked by BIC.
plot(regsubset_forwards,
scale = "bic")
# Keep the summary object so its BIC vector can be plotted on its own.
sum_forward <- summary(regsubset_forwards)
# BIC of each selected subset plotted against its index.
plot(sum_forward$bic,
type = "b")
# glmulti: size up the candidate model set for log(average_price) with
# pairwise interactions. method = "d" only runs a diagnostic of the candidate
# set; it does not fit or rank any models.
glmulti_fit <- glmulti(
log(average_price) ~ ., # model to fit: log(average_price) against every other column
level = 2, # level = 2 means try pairwise interactions. level = 1 means main effects only
data = train, # data to use for fitting
minsize = 0, # min size of model to try, in number of predictors
maxsize = -1, # max size to try, set to -1 for unlimited
marginality = TRUE, # include a pairwise interaction only if both of its main effects are in the model
method = "d", # "d" is a trial run to get the size of the problem; "h" is exhaustive search, "g" is the genetic algorithm
confsetsize = 10, # how many models glmulti() should return; must be less than the total size of the problem
plotty = FALSE, # no progress plots
report = TRUE, # print progress reports
fitfunction = lm, # use lm() as the fit function; glm() can be used for logistic regression
crit = aic # criterion for selecting the best models
)
Initialization...
TASK: Diagnostic of candidate set.
Sample size: 2164
4 factor(s).
7 covariate(s).
0 f exclusion(s).
0 c exclusion(s).
0 f:f exclusion(s).
0 c:c exclusion(s).
0 f:c exclusion(s).
Size constraints: min = 0 max = -1
Complexity constraints: min = 0 max = -1
Marginality rule.
Your candidate set contains more than 1 billion (1e9) models.
# Large candidate model: every main effect plus the listed pairwise
# interactions, fitted on the training split. Term order is kept exactly as
# written so the coefficient table is unchanged.
lm_multi <- lm(
  log(average_price) ~ 1 + type + year + region + month +
    total_volume + x4046 + x4225 + x4770 + small_bags + large_bags +
    year:type + region:type + region:year + month:type + month:year +
    month:region + x4046:total_volume + x4225:total_volume +
    small_bags:total_volume + small_bags:x4046 + small_bags:x4225 +
    small_bags:x4770 + large_bags:total_volume + large_bags:small_bags +
    type:total_volume + type:x4225 + year:total_volume + year:x4046 +
    year:x4770 + year:large_bags + region:total_volume + region:x4225 +
    region:large_bags + month:total_volume + month:x4770 + month:small_bags,
  data = train
)
summary(lm_multi)
Call:
lm(formula = log(average_price) ~ 1 + type + year + region +
month + total_volume + x4046 + x4225 + x4770 + small_bags +
large_bags + year:type + region:type + region:year + month:type +
month:year + month:region + x4046:total_volume + x4225:total_volume +
small_bags:total_volume + small_bags:x4046 + small_bags:x4225 +
small_bags:x4770 + large_bags:total_volume + large_bags:small_bags +
type:total_volume + type:x4225 + year:total_volume + year:x4046 +
year:x4770 + year:large_bags + region:total_volume + region:x4225 +
region:large_bags + month:total_volume + month:x4770 + month:small_bags,
data = train)
Residuals:
Min 1Q Median 3Q Max
-0.25806 -0.03281 0.00058 0.03568 0.34184
Coefficients: (9 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.255e-01 8.460e-02 7.393 2.13e-13 ***
typeorganic -8.939e-02 8.080e-02 -1.106 0.268751
year2016 2.364e-01 3.002e-02 7.874 5.71e-15 ***
year2017 2.824e-01 3.157e-02 8.944 < 2e-16 ***
year2018 5.397e-01 5.018e-02 10.756 < 2e-16 ***
regionGreatLakes -9.074e-02 7.469e-02 -1.215 0.224568
regionMidsouth -2.542e-01 7.638e-02 -3.328 0.000892 ***
regionNortheast -5.563e-02 8.342e-02 -0.667 0.504925
regionPlains -1.702e-01 8.056e-02 -2.113 0.034726 *
regionSouthCentral -2.062e-01 7.704e-02 -2.677 0.007497 **
regionSoutheast -5.594e-02 7.423e-02 -0.754 0.451143
regionWest 5.367e-02 7.567e-02 0.709 0.478286
month2 5.724e-02 3.871e-02 1.479 0.139351
month3 1.440e-01 3.920e-02 3.675 0.000245 ***
month4 1.436e-01 4.212e-02 3.410 0.000664 ***
month5 1.214e-01 4.336e-02 2.799 0.005176 **
month6 1.498e-01 4.709e-02 3.180 0.001495 **
month7 1.722e-01 4.461e-02 3.860 0.000117 ***
month8 1.950e-01 4.713e-02 4.136 3.68e-05 ***
month9 7.187e-02 4.665e-02 1.541 0.123600
month10 2.078e-01 4.376e-02 4.748 2.21e-06 ***
month11 5.381e-02 4.353e-02 1.236 0.216574
month12 -7.273e-02 4.378e-02 -1.661 0.096787 .
total_volume 1.854e-07 1.676e-07 1.106 0.268789
x4046 -3.609e-07 1.706e-07 -2.115 0.034527 *
x4225 -4.336e-07 1.732e-07 -2.504 0.012379 *
x4770 -2.602e-07 2.102e-07 -1.238 0.215938
small_bags -1.680e-07 1.696e-07 -0.991 0.321969
large_bags -1.011e-06 2.116e-07 -4.780 1.88e-06 ***
typeorganic:year2016 -2.288e-01 2.209e-02 -10.356 < 2e-16 ***
typeorganic:year2017 -2.323e-01 2.354e-02 -9.870 < 2e-16 ***
typeorganic:year2018 -2.189e-01 4.109e-02 -5.326 1.12e-07 ***
typeorganic:regionGreatLakes 3.226e-02 6.946e-02 0.464 0.642406
typeorganic:regionMidsouth 2.561e-01 7.180e-02 3.566 0.000371 ***
typeorganic:regionNortheast 2.220e-01 7.880e-02 2.818 0.004883 **
typeorganic:regionPlains 2.461e-01 7.620e-02 3.229 0.001263 **
typeorganic:regionSouthCentral 8.328e-02 7.340e-02 1.135 0.256716
typeorganic:regionSoutheast 1.232e-01 7.001e-02 1.759 0.078662 .
typeorganic:regionWest 3.204e-02 7.012e-02 0.457 0.647763
year2016:regionGreatLakes -6.432e-02 1.707e-02 -3.767 0.000170 ***
year2017:regionGreatLakes -3.119e-02 1.838e-02 -1.697 0.089933 .
year2018:regionGreatLakes -5.608e-02 3.125e-02 -1.795 0.072868 .
year2016:regionMidsouth -2.708e-02 1.651e-02 -1.640 0.101071
year2017:regionMidsouth 4.999e-03 1.702e-02 0.294 0.768955
year2018:regionMidsouth -7.916e-02 3.113e-02 -2.543 0.011064 *
year2016:regionNortheast 5.951e-02 1.757e-02 3.387 0.000721 ***
year2017:regionNortheast 9.892e-02 1.898e-02 5.211 2.08e-07 ***
year2018:regionNortheast 6.028e-02 3.386e-02 1.780 0.075210 .
year2016:regionPlains -1.365e-01 1.777e-02 -7.681 2.51e-14 ***
year2017:regionPlains -1.580e-01 1.763e-02 -8.966 < 2e-16 ***
year2018:regionPlains -2.518e-01 3.055e-02 -8.243 3.09e-16 ***
year2016:regionSouthCentral -8.456e-02 1.685e-02 -5.019 5.67e-07 ***
year2017:regionSouthCentral -7.148e-02 1.688e-02 -4.234 2.41e-05 ***
year2018:regionSouthCentral -8.377e-02 2.987e-02 -2.804 0.005093 **
year2016:regionSoutheast -1.538e-01 1.765e-02 -8.716 < 2e-16 ***
year2017:regionSoutheast -1.358e-01 1.726e-02 -7.868 6.00e-15 ***
year2018:regionSoutheast -2.670e-01 3.084e-02 -8.657 < 2e-16 ***
year2016:regionWest -4.118e-02 1.835e-02 -2.244 0.024977 *
year2017:regionWest 2.351e-02 1.921e-02 1.224 0.221023
year2018:regionWest 2.031e-02 3.256e-02 0.624 0.532910
typeorganic:month2 -3.672e-02 3.077e-02 -1.193 0.232840
typeorganic:month3 -9.784e-02 3.119e-02 -3.137 0.001733 **
typeorganic:month4 -7.267e-02 3.374e-02 -2.154 0.031386 *
typeorganic:month5 -6.724e-02 3.503e-02 -1.920 0.055054 .
typeorganic:month6 -6.012e-02 3.756e-02 -1.601 0.109620
typeorganic:month7 -4.639e-02 3.450e-02 -1.345 0.178859
typeorganic:month8 -9.407e-03 3.588e-02 -0.262 0.793188
typeorganic:month9 7.776e-02 3.586e-02 2.168 0.030255 *
typeorganic:month10 -6.154e-02 3.350e-02 -1.837 0.066352 .
typeorganic:month11 1.436e-02 3.414e-02 0.421 0.673979
typeorganic:month12 6.533e-02 3.386e-02 1.929 0.053820 .
year2016:month2 8.061e-03 1.793e-02 0.450 0.653024
year2017:month2 -1.553e-02 1.825e-02 -0.851 0.394887
year2018:month2 -3.545e-02 1.906e-02 -1.860 0.063085 .
year2016:month3 -1.648e-03 1.733e-02 -0.095 0.924277
year2017:month3 7.285e-02 1.770e-02 4.114 4.04e-05 ***
year2018:month3 -3.362e-02 1.928e-02 -1.743 0.081460 .
year2016:month4 -2.984e-02 1.788e-02 -1.669 0.095291 .
year2017:month4 1.326e-01 1.829e-02 7.249 6.06e-13 ***
year2018:month4 NA NA NA NA
year2016:month5 -2.726e-02 1.697e-02 -1.606 0.108426
year2017:month5 1.852e-01 1.903e-02 9.733 < 2e-16 ***
year2018:month5 NA NA NA NA
year2016:month6 1.813e-03 1.784e-02 0.102 0.919084
year2017:month6 1.589e-01 1.903e-02 8.352 < 2e-16 ***
year2018:month6 NA NA NA NA
year2016:month7 8.302e-02 1.814e-02 4.576 5.05e-06 ***
year2017:month7 1.712e-01 1.823e-02 9.395 < 2e-16 ***
year2018:month7 NA NA NA NA
year2016:month8 6.478e-02 1.910e-02 3.392 0.000708 ***
year2017:month8 2.101e-01 1.835e-02 11.453 < 2e-16 ***
year2018:month8 NA NA NA NA
year2016:month9 6.232e-02 2.017e-02 3.089 0.002037 **
year2017:month9 2.689e-01 1.995e-02 13.478 < 2e-16 ***
year2018:month9 NA NA NA NA
year2016:month10 9.812e-02 1.890e-02 5.191 2.31e-07 ***
year2017:month10 2.455e-01 1.893e-02 12.968 < 2e-16 ***
year2018:month10 NA NA NA NA
year2016:month11 1.926e-01 1.829e-02 10.531 < 2e-16 ***
year2017:month11 1.938e-01 1.894e-02 10.232 < 2e-16 ***
year2018:month11 NA NA NA NA
year2016:month12 6.625e-02 1.841e-02 3.598 0.000328 ***
year2017:month12 1.472e-01 1.817e-02 8.100 9.70e-16 ***
year2018:month12 NA NA NA NA
regionGreatLakes:month2 -5.983e-02 2.639e-02 -2.268 0.023463 *
regionMidsouth:month2 -4.433e-02 2.681e-02 -1.654 0.098383 .
regionNortheast:month2 -9.789e-03 2.630e-02 -0.372 0.709809
regionPlains:month2 -4.942e-02 2.792e-02 -1.770 0.076900 .
regionSouthCentral:month2 8.683e-04 2.563e-02 0.034 0.972976
regionSoutheast:month2 -4.947e-02 2.683e-02 -1.844 0.065389 .
regionWest:month2 1.564e-02 2.478e-02 0.631 0.528005
regionGreatLakes:month3 -5.716e-02 2.627e-02 -2.175 0.029715 *
regionMidsouth:month3 -9.828e-02 2.602e-02 -3.777 0.000164 ***
regionNortheast:month3 -4.060e-02 2.515e-02 -1.614 0.106627
regionPlains:month3 -8.950e-02 2.762e-02 -3.241 0.001213 **
regionSouthCentral:month3 -1.596e-02 2.432e-02 -0.656 0.511742
regionSoutheast:month3 -7.190e-02 2.635e-02 -2.728 0.006425 **
regionWest:month3 1.306e-02 2.433e-02 0.537 0.591516
regionGreatLakes:month4 -5.804e-02 2.716e-02 -2.137 0.032732 *
regionMidsouth:month4 -7.827e-02 2.724e-02 -2.874 0.004098 **
regionNortheast:month4 -3.249e-02 2.719e-02 -1.195 0.232252
regionPlains:month4 -8.549e-02 2.988e-02 -2.861 0.004270 **
regionSouthCentral:month4 -3.315e-02 2.682e-02 -1.236 0.216650
regionSoutheast:month4 -6.484e-02 2.800e-02 -2.316 0.020670 *
regionWest:month4 2.436e-02 2.623e-02 0.929 0.353097
regionGreatLakes:month5 -4.318e-02 2.846e-02 -1.517 0.129420
regionMidsouth:month5 -7.995e-02 2.868e-02 -2.788 0.005363 **
regionNortheast:month5 -1.522e-03 2.728e-02 -0.056 0.955507
regionPlains:month5 -3.121e-02 3.027e-02 -1.031 0.302678
regionSouthCentral:month5 8.644e-05 2.747e-02 0.003 0.997490
regionSoutheast:month5 -4.897e-02 2.838e-02 -1.725 0.084609 .
regionWest:month5 6.408e-02 2.571e-02 2.493 0.012762 *
regionGreatLakes:month6 -8.487e-02 3.090e-02 -2.747 0.006079 **
regionMidsouth:month6 -9.893e-02 2.972e-02 -3.329 0.000889 ***
regionNortheast:month6 -3.149e-02 2.940e-02 -1.071 0.284196
regionPlains:month6 -7.005e-02 3.290e-02 -2.129 0.033363 *
regionSouthCentral:month6 -5.460e-02 2.846e-02 -1.919 0.055184 .
regionSoutheast:month6 -1.059e-01 3.088e-02 -3.430 0.000616 ***
regionWest:month6 2.486e-02 2.712e-02 0.917 0.359495
regionGreatLakes:month7 -1.023e-01 2.825e-02 -3.620 0.000302 ***
regionMidsouth:month7 -1.434e-01 2.859e-02 -5.017 5.73e-07 ***
regionNortheast:month7 -1.018e-01 2.822e-02 -3.606 0.000319 ***
regionPlains:month7 -1.122e-01 3.012e-02 -3.726 0.000200 ***
regionSouthCentral:month7 -7.391e-02 2.746e-02 -2.692 0.007174 **
regionSoutheast:month7 -1.532e-01 2.957e-02 -5.182 2.42e-07 ***
regionWest:month7 2.250e-02 2.613e-02 0.861 0.389365
regionGreatLakes:month8 -1.047e-01 2.939e-02 -3.563 0.000376 ***
regionMidsouth:month8 -1.354e-01 2.888e-02 -4.690 2.93e-06 ***
regionNortheast:month8 -1.813e-01 2.916e-02 -6.217 6.19e-10 ***
regionPlains:month8 -1.270e-01 3.198e-02 -3.972 7.38e-05 ***
regionSouthCentral:month8 -6.792e-02 2.865e-02 -2.370 0.017870 *
regionSoutheast:month8 -1.494e-01 2.930e-02 -5.100 3.74e-07 ***
regionWest:month8 7.398e-03 2.718e-02 0.272 0.785475
regionGreatLakes:month9 -4.479e-02 2.913e-02 -1.538 0.124330
regionMidsouth:month9 -7.889e-02 2.942e-02 -2.682 0.007391 **
regionNortheast:month9 -1.303e-01 2.950e-02 -4.417 1.05e-05 ***
regionPlains:month9 -5.813e-02 3.243e-02 -1.793 0.073200 .
regionSouthCentral:month9 -3.441e-02 2.919e-02 -1.179 0.238541
regionSoutheast:month9 -5.982e-02 2.978e-02 -2.008 0.044744 *
regionWest:month9 -5.648e-03 2.804e-02 -0.201 0.840359
regionGreatLakes:month10 -8.735e-02 2.856e-02 -3.058 0.002258 **
regionMidsouth:month10 -1.234e-01 2.785e-02 -4.431 9.92e-06 ***
regionNortheast:month10 -2.329e-01 2.773e-02 -8.399 < 2e-16 ***
regionPlains:month10 -1.379e-01 2.929e-02 -4.709 2.67e-06 ***
regionSouthCentral:month10 -1.532e-02 2.776e-02 -0.552 0.580981
regionSoutheast:month10 -1.142e-01 2.859e-02 -3.995 6.73e-05 ***
regionWest:month10 -3.187e-02 2.619e-02 -1.217 0.223842
regionGreatLakes:month11 -7.780e-02 2.847e-02 -2.733 0.006334 **
regionMidsouth:month11 -8.089e-02 2.813e-02 -2.876 0.004074 **
regionNortheast:month11 -1.287e-01 2.832e-02 -4.544 5.87e-06 ***
regionPlains:month11 -8.710e-02 3.119e-02 -2.793 0.005277 **
regionSouthCentral:month11 -1.093e-02 2.803e-02 -0.390 0.696563
regionSoutheast:month11 -1.068e-01 2.867e-02 -3.725 0.000201 ***
regionWest:month11 -2.611e-04 2.704e-02 -0.010 0.992299
regionGreatLakes:month12 -1.416e-02 2.916e-02 -0.486 0.627347
regionMidsouth:month12 -1.062e-02 2.975e-02 -0.357 0.721173
regionNortheast:month12 -2.092e-03 2.859e-02 -0.073 0.941684
regionPlains:month12 -7.905e-03 3.088e-02 -0.256 0.798004
regionSouthCentral:month12 4.044e-02 2.852e-02 1.418 0.156439
regionSoutheast:month12 -5.684e-02 2.903e-02 -1.958 0.050410 .
regionWest:month12 4.433e-02 2.700e-02 1.642 0.100839
total_volume:x4046 5.491e-15 4.480e-15 1.226 0.220527
total_volume:x4225 2.581e-14 5.199e-15 4.965 7.47e-07 ***
total_volume:small_bags -3.073e-14 1.337e-14 -2.299 0.021637 *
x4046:small_bags 3.539e-14 2.394e-14 1.478 0.139504
x4225:small_bags 2.093e-14 2.223e-14 0.941 0.346728
x4770:small_bags 1.491e-14 8.351e-14 0.179 0.858323
total_volume:large_bags 2.089e-14 8.058e-15 2.593 0.009596 **
small_bags:large_bags 1.436e-13 3.743e-14 3.835 0.000130 ***
typeorganic:total_volume -2.176e-06 7.548e-08 -28.824 < 2e-16 ***
typeorganic:x4225 2.534e-06 1.414e-07 17.923 < 2e-16 ***
year2016:total_volume -8.392e-08 8.251e-09 -10.171 < 2e-16 ***
year2017:total_volume -8.927e-08 8.699e-09 -10.263 < 2e-16 ***
year2018:total_volume -1.135e-07 1.261e-08 -9.000 < 2e-16 ***
year2016:x4046 7.846e-08 1.106e-08 7.096 1.80e-12 ***
year2017:x4046 8.744e-08 1.235e-08 7.082 1.99e-12 ***
year2018:x4046 1.099e-07 2.053e-08 5.355 9.59e-08 ***
year2016:x4770 1.065e-07 6.667e-08 1.597 0.110473
year2017:x4770 1.203e-07 9.598e-08 1.254 0.210162
year2018:x4770 3.472e-08 1.470e-07 0.236 0.813225
[ reached getOption("max.print") -- omitted 57 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0601 on 1916 degrees of freedom
Multiple R-squared: 0.9539, Adjusted R-squared: 0.948
F-statistic: 160.6 on 247 and 1916 DF, p-value: < 2.2e-16
# One-row fit summary for the large interaction model.
glance(lm_multi)

# Training-set RMSE for lm_multi: square root of the mean squared residual.
train_rmse <- train %>%
  add_residuals(lm_multi) %>%
  summarise(rmse = mean(resid^2)^0.5) %>%
  pull(rmse)
Warning: prediction from a rank-deficient fit may be misleading
# Print the training RMSE of lm_multi.
train_rmse
[1] 0.05013213
# Score the held-out test split with lm_multi, keeping only the observed
# price, the model prediction and the residual.
predictions_test <- test %>%
  add_residuals(lm_multi) %>%
  add_predictions(lm_multi) %>%
  dplyr::select(average_price, pred, resid)
Warning: prediction from a rank-deficient fit may be misleadingWarning: prediction from a rank-deficient fit may be misleading
# Test-set RMSE for lm_multi (variable name keeps its original spelling).
test_rsme <- predictions_test %>%
  summarise(rmse = mean(resid^2)^0.5) %>%
  pull(rmse)

# Ratio of test to training RMSE; values well above 1 point to over-fitting.
test_rsme / train_rmse
[1] 1.265112
# Over-fit? The previous model generalises worse on the test split despite a
# good BIC, so refit lm_multi with a reduced set of terms.
lm_multi <- lm(
  log(average_price) ~ 1 + region + type + year + month + total_volume +
    large_bags + year:type + month:year + region:total_volume +
    type:total_volume,
  data = train
)
summary(lm_multi)
Call:
lm(formula = log(average_price) ~ 1 + region + type + year +
month + total_volume + large_bags + year:type + month:year +
region:total_volume + type:total_volume, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.34697 -0.04625 0.00192 0.04766 0.53098
Coefficients: (9 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.663e-01 1.828e-02 25.506 < 2e-16 ***
regionGreatLakes -5.460e-02 1.121e-02 -4.871 1.19e-06 ***
regionMidsouth -5.180e-02 1.038e-02 -4.993 6.45e-07 ***
regionNortheast 1.023e-01 9.396e-03 10.884 < 2e-16 ***
regionPlains -4.999e-02 1.141e-02 -4.381 1.24e-05 ***
regionSouthCentral -2.804e-01 9.669e-03 -28.998 < 2e-16 ***
regionSoutheast -1.015e-01 1.085e-02 -9.358 < 2e-16 ***
regionWest 1.086e-01 1.271e-02 8.546 < 2e-16 ***
typeorganic 1.814e-01 1.465e-02 12.383 < 2e-16 ***
year2016 4.252e-02 1.509e-02 2.819 0.004869 **
year2017 1.042e-01 1.552e-02 6.716 2.40e-11 ***
year2018 2.433e-01 1.705e-02 14.265 < 2e-16 ***
month2 8.632e-03 1.556e-02 0.555 0.579028
month3 1.649e-02 1.472e-02 1.120 0.262703
month4 6.147e-02 1.530e-02 4.017 6.10e-05 ***
month5 5.158e-02 1.497e-02 3.446 0.000580 ***
month6 7.818e-02 1.534e-02 5.095 3.79e-07 ***
month7 8.055e-02 1.565e-02 5.148 2.88e-07 ***
month8 8.920e-02 1.428e-02 6.246 5.07e-10 ***
month9 6.977e-02 1.536e-02 4.543 5.86e-06 ***
month10 3.473e-02 1.554e-02 2.235 0.025536 *
month11 -2.938e-02 1.468e-02 -2.001 0.045483 *
month12 -3.794e-02 1.494e-02 -2.538 0.011206 *
total_volume -8.671e-08 2.850e-09 -30.428 < 2e-16 ***
large_bags -1.888e-01 1.366e-02 -13.821 < 2e-16 ***
typeorganic:year2016 -8.678e-02 8.663e-03 -10.018 < 2e-16 ***
typeorganic:year2017 -1.263e-01 9.171e-03 -13.767 < 2e-16 ***
typeorganic:year2018 -1.221e-01 1.564e-02 -7.810 8.92e-15 ***
year2016:month2 -1.740e-02 2.082e-02 -0.836 0.403495
year2017:month2 -3.230e-02 2.114e-02 -1.528 0.126740
year2018:month2 -2.678e-02 2.166e-02 -1.236 0.216488
year2016:month3 -1.764e-02 2.023e-02 -0.872 0.383462
year2017:month3 5.821e-02 2.060e-02 2.825 0.004766 **
year2018:month3 -3.319e-02 2.096e-02 -1.583 0.113533
year2016:month4 -6.828e-02 2.077e-02 -3.288 0.001025 **
year2017:month4 1.033e-01 2.047e-02 5.046 4.89e-07 ***
year2018:month4 NA NA NA NA
year2016:month5 -3.712e-02 2.012e-02 -1.845 0.065210 .
year2017:month5 1.571e-01 2.091e-02 7.512 8.57e-14 ***
year2018:month5 NA NA NA NA
year2016:month6 -2.374e-02 2.067e-02 -1.149 0.250864
year2017:month6 1.148e-01 2.102e-02 5.463 5.23e-08 ***
year2018:month6 NA NA NA NA
year2016:month7 4.701e-02 2.073e-02 2.268 0.023445 *
year2017:month7 1.328e-01 2.112e-02 6.286 3.95e-10 ***
year2018:month7 NA NA NA NA
year2016:month8 2.643e-02 2.014e-02 1.312 0.189538
year2017:month8 1.663e-01 2.044e-02 8.135 6.95e-16 ***
year2018:month8 NA NA NA NA
year2016:month9 6.379e-02 2.077e-02 3.071 0.002159 **
year2017:month9 2.513e-01 2.147e-02 11.706 < 2e-16 ***
year2018:month9 NA NA NA NA
year2016:month10 1.389e-01 2.053e-02 6.766 1.72e-11 ***
year2017:month10 2.413e-01 2.104e-02 11.464 < 2e-16 ***
year2018:month10 NA NA NA NA
year2016:month11 1.972e-01 2.048e-02 9.626 < 2e-16 ***
year2017:month11 1.663e-01 2.074e-02 8.017 1.77e-15 ***
year2018:month11 NA NA NA NA
year2016:month12 3.177e-02 2.053e-02 1.547 0.121906
year2017:month12 1.024e-01 2.050e-02 4.998 6.29e-07 ***
year2018:month12 NA NA NA NA
regionGreatLakes:total_volume -2.009e-08 3.755e-09 -5.350 9.78e-08 ***
regionMidsouth:total_volume -3.521e-08 4.501e-09 -7.822 8.14e-15 ***
regionNortheast:total_volume -9.684e-09 2.902e-09 -3.337 0.000861 ***
regionPlains:total_volume -1.384e-07 7.841e-09 -17.646 < 2e-16 ***
regionSouthCentral:total_volume 1.061e-08 2.251e-09 4.711 2.62e-06 ***
regionSoutheast:total_volume 1.130e-09 3.422e-09 0.330 0.741157
regionWest:total_volume -2.190e-08 2.301e-09 -9.519 < 2e-16 ***
typeorganic:total_volume -1.213e-06 4.779e-08 -25.372 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.07631 on 2104 degrees of freedom
Multiple R-squared: 0.9171, Adjusted R-squared: 0.9148
F-statistic: 394.6 on 59 and 2104 DF, p-value: < 2.2e-16
# Fit summary and diagnostic plots for the reduced lm_multi.
glance(lm_multi)
plot(lm_multi)

# lm12: main effects plus the type:total_volume, type:region, type:year and
# region:total_volume interactions, fitted on the training split.
lm12 <- lm(
  log(average_price) ~ type + region + month + year + x4046 + total_volume +
    large_bags + type:total_volume + type:region + type:year +
    region:total_volume,
  data = train
)
summary(lm12)
Call:
lm(formula = log(average_price) ~ type + region + month + year +
x4046 + total_volume + large_bags + type:total_volume + type:region +
type:year + region:total_volume, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.29711 -0.05424 -0.00103 0.05371 0.58560
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.040e-01 4.200e-02 12.000 < 2e-16 ***
typeorganic 1.132e-01 4.174e-02 2.712 0.006737 **
regionGreatLakes 4.623e-02 5.474e-02 0.845 0.398458
regionMidsouth -6.778e-02 5.790e-02 -1.171 0.241853
regionNortheast -9.002e-02 5.419e-02 -1.661 0.096845 .
regionPlains -1.949e-02 5.207e-02 -0.374 0.708235
regionSouthCentral -1.663e-01 6.121e-02 -2.717 0.006641 **
regionSoutheast -6.441e-02 5.020e-02 -1.283 0.199627
regionWest -1.099e-01 5.461e-02 -2.013 0.044280 *
month2 -5.992e-03 8.212e-03 -0.730 0.465728
month3 2.174e-02 8.023e-03 2.710 0.006777 **
month4 7.506e-02 8.803e-03 8.527 < 2e-16 ***
month5 9.300e-02 8.875e-03 10.479 < 2e-16 ***
month6 1.089e-01 8.942e-03 12.181 < 2e-16 ***
month7 1.411e-01 8.794e-03 16.051 < 2e-16 ***
month8 1.452e-01 8.775e-03 16.547 < 2e-16 ***
month9 1.671e-01 9.052e-03 18.459 < 2e-16 ***
month10 1.585e-01 8.883e-03 17.847 < 2e-16 ***
month11 7.760e-02 9.009e-03 8.614 < 2e-16 ***
month12 2.495e-03 8.786e-03 0.284 0.776443
year2016 9.240e-02 7.159e-03 12.907 < 2e-16 ***
year2017 2.423e-01 6.958e-03 34.822 < 2e-16 ***
year2018 3.110e-01 1.221e-02 25.475 < 2e-16 ***
x4046 4.992e-08 9.840e-09 5.073 4.26e-07 ***
total_volume -1.229e-07 7.607e-09 -16.158 < 2e-16 ***
large_bags -2.163e-01 1.476e-02 -14.657 < 2e-16 ***
typeorganic:total_volume -1.338e-06 5.432e-08 -24.634 < 2e-16 ***
typeorganic:regionGreatLakes -9.084e-02 5.398e-02 -1.683 0.092572 .
typeorganic:regionMidsouth 1.609e-02 5.713e-02 0.282 0.778277
typeorganic:regionNortheast 1.904e-01 5.361e-02 3.551 0.000393 ***
typeorganic:regionPlains -3.563e-02 5.227e-02 -0.682 0.495506
typeorganic:regionSouthCentral -1.245e-01 6.087e-02 -2.046 0.040916 *
typeorganic:regionSoutheast -4.898e-02 5.038e-02 -0.972 0.330996
typeorganic:regionWest 2.467e-01 5.423e-02 4.549 5.70e-06 ***
typeorganic:year2016 -9.931e-02 9.931e-03 -10.000 < 2e-16 ***
typeorganic:year2017 -1.291e-01 1.028e-02 -12.552 < 2e-16 ***
typeorganic:year2018 -1.448e-01 1.733e-02 -8.356 < 2e-16 ***
regionGreatLakes:total_volume -4.966e-08 1.280e-08 -3.880 0.000108 ***
regionMidsouth:total_volume -3.889e-08 1.574e-08 -2.471 0.013540 *
regionNortheast:total_volume 4.550e-08 1.119e-08 4.066 4.95e-05 ***
regionPlains:total_volume -1.954e-07 1.911e-08 -10.229 < 2e-16 ***
regionSouthCentral:total_volume -1.441e-08 1.029e-08 -1.401 0.161390
regionSoutheast:total_volume -2.368e-08 1.060e-08 -2.233 0.025684 *
regionWest:total_volume 1.558e-08 8.890e-09 1.753 0.079786 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08421 on 2120 degrees of freedom
Multiple R-squared: 0.8983, Adjusted R-squared: 0.8962
F-statistic: 435.4 on 43 and 2120 DF, p-value: < 2.2e-16
# One-row fit summary and diagnostic plots for lm12.
glance(lm12)
plot(lm12)
# Pairwise plots of average_price against every factor column.
# Predicate functions are wrapped in where(): passing a bare predicate such
# as is.factor to select() is deprecated in tidyselect and raises a warning.
avocados_1 %>%
  dplyr::select(average_price, where(is.factor)) %>%
  ggpairs()

# month, type and region all quite strong

# Pairwise plots of all numeric columns.
avocados_1 %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()
# lm4: untransformed average_price explained by type, region and month only,
# fitted on the training split, with its summary and diagnostic plots.
lm4 <- lm(average_price ~ type + region + month, train)
summary(lm4)
plot(lm4)
# Residuals of the current model alongside the predictors not yet used, to
# look for remaining structure worth adding to the model.
# NOTE(review): residuals are taken from lm5, but the model fitted just above
# is lm4 and the columns dropped here (type, region, month) are exactly lm4's
# predictors - confirm lm5 is the intended model.
avocados_resid <- train %>%
  add_residuals(lm5) %>%
  dplyr::select(-average_price, -type, -region, -month) %>%
  dplyr::select(resid, everything())

# Residuals against the remaining factor columns.
# Predicate functions are wrapped in where(): passing a bare predicate such
# as is.factor to select() is deprecated in tidyselect and raises a warning.
avocados_resid %>%
  dplyr::select(resid, where(is.factor)) %>%
  ggpairs()

# month, type and region all quite strong

# Residuals against the remaining numeric columns.
avocados_resid %>%
  dplyr::select(where(is.numeric)) %>%
  ggpairs()